# Clean up APNs for fires in other states
pacman::p_load(stringdist, data.table, dplyr, fst)

## Preliminaries
rm(list = ls())

source("code/globals.R")

damagedata <- readRDS(file.path(WORKING, "damage_other_states.RDS"))

############################################################################
########### SEPARATE FILES FOR VARIOUS STATE BY TIME PERIOD SNAPSHOTS ######
############################################################################

segmentdamagedata <- function(stateabbr, startdatestring, enddatestring) {
  segment <- damagedata[State == stateabbr &
    date >= as.Date(startdatestring) &
    date < as.Date(enddatestring), ]
  ## confirm unique parcel ids
  stopifnot(anyDuplicated(segment[, c("FIPS", "UnformattedAssessorParcelNumber")]) == 0)
  return(segment)
}


#### Arizona ########################
damage_az_13 <- segmentdamagedata("AZ", "2013-01-01", "2014-01-01")
damage_az_17 <- segmentdamagedata("AZ", "2017-01-01", "2018-01-01")

##### Colorado #####################
damage_co_12 <- segmentdamagedata("CO", "2012-01-01", "2013-01-01")
damage_co_13 <- segmentdamagedata("CO", "2013-01-01", "2014-01-01")
damage_co_16 <- segmentdamagedata("CO", "2016-01-01", "2018-06-30")
damage_co_20 <- segmentdamagedata("CO", "2020-08-01", "2021-01-01")

##### Oregon #####################
damage_or_20 <- segmentdamagedata("OR", "2020-08-01", "2021-01-01")

##### Washington #####################
damage_wa_14 <- segmentdamagedata("WA", "2014-01-01", "2015-01-01")
damage_wa_15 <- segmentdamagedata("WA", "2015-01-01", "2016-01-01")
damage_wa_20 <- segmentdamagedata("WA", "2020-08-01", "2021-01-01")

### confirm these span all incidents in the data
stopifnot(nrow(damage_az_13) + nrow(damage_az_17) + nrow(damage_co_12) +
  nrow(damage_co_13) + nrow(damage_co_16) + nrow(damage_co_20) +
  nrow(damage_or_20) + nrow(damage_wa_14) + nrow(damage_wa_15) +
  nrow(damage_wa_20) ==
  nrow(damagedata))

###################################################################
### LOAD CORRESPONDING ZTRAX SNAPSHOTS ############################
###################################################################

## -- from the historical assessments data
loadhistatt <- function(stateabbr, year) {
  d <- read_fst(file.path(WORKING, "ztraxdata", paste0(stateabbr, "attr", year, ".fst")), as.data.table = TRUE)
  ## confirm unique ztrax ids
  stopifnot(anyDuplicated(d[, c("ImportParcelID", "BuildingOrImprovementNumber")]) == 0)
  return(d)
}

attr_az_13 <- loadhistatt("AZ", 2013)

attr_co_12 <- loadhistatt("CO", 2012)
attr_co_13 <- loadhistatt("CO", 2013)

attr_wa_14 <- loadhistatt("WA", 2014)
attr_wa_15 <- loadhistatt("WA", 2015)

## -- from the 2016 through 2020 data (fst files)
loadneweratt <- function(foldername, stateabbr, datayear) {
  d <- read_fst(file.path(WORKING, "ztraxdata", paste0(stateabbr, "attr", datayear, ".fst")), as.data.table = T)
  ## confirm unique ztrax ids
  stopifnot(anyDuplicated(d[, c("ImportParcelID", "BuildingOrImprovementNumber")]) == 0)
  return(d)
}

attr_az_16 <- loadneweratt("2016", "AZ", 2016)

attr_co_16 <- loadneweratt("2016", "CO", 2016)
attr_co_20 <- loadneweratt("20201012", "CO", 2020)

attr_or_20 <- loadneweratt("20201012", "OR", 2020)

attr_wa_20 <- loadneweratt("20201012", "WA", 2020)


## Function to inspect IDs by county in each dataset. Looks for approx string matches on address; then use that to compare APNs
inspectAPN <- function(targetfips, damagefile, attrfile) {
  dd <- damagefile[FIPS == targetfips, c("UnformattedAssessorParcelNumber", "Address", "NotesField")]
  dd$Address <- tolower(dd$Address)
  aa <- attrfile[FIPS == targetfips, c(
    "BuildingOrImprovementNumber", "UnformattedAssessorParcelNumber", "PropertyFullStreetAddress",
    "NoOfUnits", "PropertyLandUseStndCode"
  )]
  aa$address <- tolower(aa$PropertyFullStreetAddress)
  names(dd)[names(dd) == "UnformattedAssessorParcelNumber"] <- "UnformattedAPN_DINS"
  names(dd)[names(dd) == "Address"] <- "address_DINS"
  ## For 100 random fire records, find closest address string in ZTrax
  set.seed(16473)
  dd$rand <- runif(nrow(dd))
  dd <- dd[order(dd$rand), ]
  if (nrow(dd) > 100) {
    dd <- dd[1:100, ]
  }
  table <- data.frame(matrix(nrow = nrow(dd), ncol = 5, data = NA))

  for (i in seq(1, nrow(dd))) {
    bestmatch <- amatch(dd[i, address_DINS], aa$address, maxDist = 100, method = "dl")
    tablerow <- cbind(
      dd[i, c("UnformattedAPN_DINS", "address_DINS")],
      aa[bestmatch, c("PropertyFullStreetAddress", "UnformattedAssessorParcelNumber")]
    )
    tablerow$stringdist <- stringdist(tolower(tablerow$address_DINS), tolower(tablerow$PropertyFullStreetAddress), method = "dl")
    table[i, ] <- tablerow
  }

  names(table) <- names(tablerow)
  table <- table[, c(
    "stringdist", "UnformattedAssessorParcelNumber", "UnformattedAPN_DINS",
    "PropertyFullStreetAddress",
    "address_DINS"
  )]
  table <- table[order(table$stringdist), ]
  table$lengthUAPN_DINS <- nchar(table$UnformattedAPN_DINS)
  table$lengthUAPN_Ztrax <- nchar(table$UnformattedAssessorParcelNumber)
  return(table)
}

## Function to check how many string address matches are perfectly match on UAPN
checkAPN <- function(targetfips, damagefile, attrfile) {
  v <- inspectAPN(targetfips, damagefile, attrfile)
  v <- v[v$stringdist <= 10 & !is.na(v$stringdist), ]
  print(nrow(v[v$UnformattedAssessorParcelNumber == v$UnformattedAPN_DINS, ]) / nrow(v))
  v <- v[v$stringdist <= 8 & !is.na(v$stringdist), ]
  print(nrow(v[v$UnformattedAssessorParcelNumber == v$UnformattedAPN_DINS, ]) / nrow(v))
}



## --------- Arizona 2013
damage_az_13[, .N, by = FIPS]

# 04025 - no changes needed
inspectAPN(4025, damage_az_13, attr_az_13)
checkAPN(4025, damage_az_13, attr_az_13)

## ------ Arizona 2017
damage_az_17[, .N, by = FIPS]

# 04025 - no changes needed
damage_az_17[, .N, by = FIPS]
inspectAPN(4025, damage_az_17, attr_az_16)
checkAPN(4025, damage_az_17, attr_az_16)


## --------- Colorado 2012
damage_co_12[, .N, by = FIPS]

# 08041 - No changes needed
inspectAPN(8041, damage_co_12, attr_co_12)
checkAPN(8041, damage_co_12, attr_co_12)

# 08069 - No changes needed
inspectAPN(8069, damage_co_12, attr_co_12)
checkAPN(8069, damage_co_12, attr_co_12)


## --------- Colorado 2013
damage_co_13[, .N, by = FIPS]

# 08041 - No changes needed
inspectAPN(8041, damage_co_13, attr_co_13)
checkAPN(8041, damage_co_13, attr_co_13)



## --------- Colorado 2016--2018
damage_co_16[, .N, by = FIPS]

# 08041 - No changes needed
inspectAPN(8041, damage_co_16, attr_co_16)
checkAPN(8041, damage_co_16, attr_co_16)



## --------- Colorado 2020
damage_co_20[, .N, by = FIPS]

# 08069 - No changes needed
inspectAPN(8069, damage_co_20, attr_co_20)
checkAPN(8069, damage_co_20, attr_co_20)

# 08049 - No changes needed
inspectAPN(8049, damage_co_20, attr_co_20)
checkAPN(8049, damage_co_20, attr_co_20)


## --------- Oregon 2020
damage_or_20[, .N, by = FIPS]

# 41029 --- No changes needed but is this conversion of MAPLOT to APN valid? Check more carefully. Not ALL properties associated with a MapLot may be lost?
inspectAPN(41029, damage_or_20, attr_or_20)
checkAPN(41029, damage_or_20, attr_or_20)

# 41039
inspectAPN(41039, damage_or_20, attr_or_20)
checkAPN(41039, damage_or_20, attr_or_20)

# 41047 --- No changes needed
inspectAPN(41047, damage_or_20, attr_or_20)
checkAPN(41047, damage_or_20, attr_or_20)

# 41041 --- no changes needed
inspectAPN(41041, damage_or_20, attr_or_20)
checkAPN(41041, damage_or_20, attr_or_20)

## --------- Washington 2014
damage_wa_14[, .N, by = FIPS]

# 53047- no changes needed
inspectAPN(53047, damage_wa_20, attr_wa_20)

## --------- Washington 2015
damage_wa_15[, .N, by = FIPS]

# 53047- no changes needed
inspectAPN(53047, damage_wa_15, attr_wa_15)

## --------- Washington 2020
damage_wa_20[, .N, by = FIPS]

# 53047- no changes needed
inspectAPN(53047, damage_wa_20, attr_wa_20)


#------------------------------------------------------------------------------------------------------------------------------------------------------#
#------------------ Handle a problem on the ZTRAX side: APN does not uniquely identify homes in ZTRAX for some counties/years. ------------------------#
#------------------------------------------------------------------------------------------------------------------------------------------------------#
## This is limited to 2020 ZTRAX files so far. Not ideal, but it is a small share of total records that also appear in the damage data, which is the only place it matters.

damagefiles <- list(
  damage_az_13, damage_az_17,
  damage_co_12, damage_co_13, damage_co_16, damage_co_20,
  damage_or_20,
  damage_wa_14, damage_wa_15, damage_wa_20
)
attrfiles <- list(
  attr_az_13, attr_az_16,
  attr_co_12, attr_co_13, attr_co_16, attr_co_20,
  attr_or_20,
  attr_wa_14, attr_wa_15, attr_wa_20
)

## Which files does this affect? (In which ZTRAX files are FIPS+UAPN not a unique id after inner merge to damage data?).
#- looks like it's Colorado and Oregon 2020 files.
## - For Colorado county 8069, 12 records are present in multiples (out of 185 destroyed homes).
## - For Colorado county 8049 (Grand County, East Troublesome Fire), many parcels have multiple structures. BUT, the damage assessment data seem to only label things destroyed if the "main home" is destroyed.
for (p in seq_len(length(attrfiles))) {
  print(p)
  a <- merge(attrfiles[[p]], damagefiles[[p]], by = c("UnformattedAssessorParcelNumber", "FIPS"))
  print(anyDuplicated(a[, .(UnformattedAssessorParcelNumber, FIPS)]) == 0)
}
rm(damagefiles, attrfiles)

## --------------------------- Handle these duplicates -----------------------------#
## - I apply a heuristic to identify the "main" ZTRAX structure on each parcel that is reported as destroyed in the damage data (which is assumedly what the damage inspectors were considering).
## - This heuristic orders structures by number of bedrooms, and then square footage, and then Tax Amount, then breaks any remaining ties randomly.
## - This give highly similar results to just choosing "buildingorimprovementnumber" equal to 1 in all cases.
## Also, in most of these cases ALL the structures on the parcel are destroyed anyway. See "NumOfStruc" and related fields in Grand County damage data.

handle_ztrax_APN_dupes <- function(damagefile, attrfile) {
  a <- merge(attrfile, damagefile, by = c("UnformattedAssessorParcelNumber", "FIPS"))
  a[, Multiple := .N > 1, by = .(UnformattedAssessorParcelNumber, FIPS)]

  b <- a[Multiple == TRUE, ] %>%
    dplyr::select(
      FIPS, ImportParcelID, BuildingOrImprovementNumber, UnformattedAssessorParcelNumber, sqfeet, LotSizeAcres,
      TaxAmount, YearBuilt, TotalBedrooms, PropertyLandUseStndCode, PropertyFullStreetAddress
    ) %>%
    dplyr::mutate(sqfeet = replace(sqfeet, is.na(sqfeet), 0)) %>% # replace NA values with 0 so they don't sort to the bottom in each group (important below when sorting desired obs to end)
    dplyr::mutate(TotalBedrooms = replace(TotalBedrooms, is.na(TotalBedrooms), 0)) %>%
    dplyr::mutate(TaxAmount = replace(TaxAmount, is.na(TaxAmount), 0)) %>%
    dplyr::mutate(rand = runif(n())) %>% # random tiebreaker. Note this is reproducible because we set seed in globals.R (which gets called at top of script)
    dplyr::arrange(FIPS, UnformattedAssessorParcelNumber, TotalBedrooms, sqfeet, TaxAmount, rand) %>%
    group_by(FIPS, UnformattedAssessorParcelNumber) %>%
    dplyr::mutate(copy = row_number(), numcopies = n()) %>%
    ungroup()

  #-- make a list of rows to be deleted (all except the LAST row in each sorted group)
  b <- b %>%
    dplyr::filter(copy != numcopies) %>%
    dplyr::select(ImportParcelID, BuildingOrImprovementNumber) %>%
    dplyr::mutate(tempconcat = paste0(ImportParcelID, "---j---", BuildingOrImprovementNumber))

  ## -- Remove duplicates from attribute data
  oldrows <- nrow(attrfile)
  attrfile <- attrfile %>%
    dplyr::mutate(tempconcat = paste0(ImportParcelID, "---j---", BuildingOrImprovementNumber)) %>%
    dplyr::filter(!(tempconcat %in% b$tempconcat)) %>%
    dplyr::mutate(tempconcat = NULL)

  print(paste0(oldrows - nrow(attrfile), " rows deleted"))
  return(attrfile)
}

## -- Apply this function where needed
attr_co_20 <- handle_ztrax_APN_dupes(damage_co_20, attr_co_20)
attr_or_20 <- handle_ztrax_APN_dupes(damage_or_20, attr_or_20)

#########################################################
############ Save the cleaned files #####################
#########################################################
#- set up 2 big NAMED lists of data frames to save

cleandamagelist <- list(
  damage_az_13 = damage_az_13,
  damage_az_17 = damage_az_17,
  damage_co_12 = damage_co_12,
  damage_co_13 = damage_co_13,
  damage_co_16 = damage_co_16,
  damage_co_20 = damage_co_20,
  damage_or_20 = damage_or_20,
  damage_wa_14 = damage_wa_14,
  damage_wa_15 = damage_wa_15,
  damage_wa_20 = damage_wa_20
)

cleanattrlist <- list(
  attr_az_13 = attr_az_13,
  attr_az_16 = attr_az_16,
  attr_co_12 = attr_co_12,
  attr_co_13 = attr_co_13,
  attr_co_16 = attr_co_16,
  attr_co_20 = attr_co_20,
  attr_or_20 = attr_or_20,
  attr_wa_14 = attr_wa_14,
  attr_wa_15 = attr_wa_15,
  attr_wa_20 = attr_wa_20
)

#--- Limit ZTRAX data to just counties in damage data
limitztrax <- function(damagefile, attrfile) {
  y <- attrfile[attrfile$FIPS %in% unique(damagefile$FIPS), ]
  return(y)
}
for (i in seq_len(length(cleanattrlist))) {
  cleanattrlist[[i]] <- limitztrax(cleandamagelist[[i]], cleanattrlist[[i]])
}
rm(i)

save_clean_attr_data <- function(filenum) {
  write_fst(cleanattrlist[[filenum]], file.path(WORKING, paste0(names(cleanattrlist)[filenum], "_cleaned.fst")))
}

lapply(
  seq_len(length(cleanattrlist)),
  save_clean_attr_data
)

save_clean_damage_data <- function(filenum) {
  write_fst(cleandamagelist[[filenum]], file.path(WORKING, paste0(names(cleandamagelist)[filenum], "_cleaned.fst")))
}

lapply(
  seq_len(length(cleandamagelist)),
  save_clean_damage_data
)
